UK Police Crime Data

  • This notebook visualises street-level crime data published by UK police forces, first as a static map and then as an interactive one

Motivation

Imports

In [1]:
%matplotlib inline
import pandas as pd
import os
import getpass
import json
import seaborn
import matplotlib.pyplot as plt
from IPython.display import display
import geopandas as gpd
from shapely.geometry import Point
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

def pythonise_string(input_string):
    """Return ``input_string`` lower-cased, with each space turned into an underscore."""
    lowered = input_string.lower()
    # Splitting on a single literal space keeps empty pieces, so consecutive
    # spaces still become consecutive underscores.
    return '_'.join(lowered.split(' '))

Mapbox Authentication

In [2]:
# Prefer a token stored in a local ``authentication.json``; fall back to an
# interactive prompt when the file cannot be used.
try:
    with open('authentication.json') as f:
        mapbox_access_token = json.load(f)['mapbox_access_token']
except (OSError, ValueError, KeyError, TypeError):
    # OSError: file missing or unreadable; ValueError: not valid JSON
    # (json.JSONDecodeError is a ValueError subclass); KeyError / TypeError:
    # the JSON does not hold a 'mapbox_access_token' entry.
    # Anything else (e.g. KeyboardInterrupt) is deliberately left to propagate.
    mapbox_access_token = getpass.getpass()

User Parameters

In [3]:
# Root folder of the monthly police CSV extracts, plus the region and data-set
# type that are used to build each file name.
data_path = r'D:\uk_police_crime_data\police_data'
region = 'metropolitan'
data_type = 'street'

# First and last month of interest, each pinned to the first day of the month.
start_date = pd.Timestamp('2014-12-01')
end_date = pd.Timestamp('2019-05-01')

# Borough boundary shapefile, and the coordinate reference system assigned to the crime points.
map_data_path = r'D:\uk_police_crime_data\map_data\statistical-gis-boundaries-london\ESRI\London_Borough_Excluding_MHW.shp'
crs = dict(init='epsg:4326')

Raw Data

In [5]:
def get_data_file_path(date, data_path=data_path, region=region, data_type=data_type):
    """Build the path of the CSV file holding one month of data.

    The file lives in a ``YYYY-MM`` sub-folder of ``data_path`` and is named
    ``YYYY-MM-<region>-<data_type>.csv``.

    Parameters
    ----------
    date : object supporting ``strftime`` (e.g. ``pd.Timestamp``)
        Month to locate; only its year and month are used.
    data_path, region, data_type : str
        Default to the notebook-level parameters of the same names, captured
        when this function is defined.

    Returns
    -------
    str
        Path to the CSV file for that month.
    """
    month_label = date.strftime('%Y-%m')
    file_name = f'{month_label}-{region}-{data_type}.csv'
    return os.path.join(data_path, month_label, file_name)
    
# One CSV per calendar month: iterate over every month-start between start_date
# and end_date (both inclusive). Defined here so the cell runs on a fresh kernel;
# no earlier cell creates ``data_date_range``.
data_date_range = pd.date_range(start=start_date, end=end_date, freq='MS')

# Read each monthly file and stack them into a single frame with a fresh 0..n-1 index.
raw_df = pd.concat(
    [pd.read_csv(get_data_file_path(date)) for date in data_date_range],
    axis=0,
).reset_index(drop=True)
display(raw_df.head())
Crime ID Month Reported by Falls within Longitude Latitude Location LSOA code LSOA name Crime type Last outcome category Context
0 NaN 2014-12 Metropolitan Police Service Metropolitan Police Service 0.134947 51.588063 On or near Mead Grove E01000027 Barking and Dagenham 001A Anti-social behaviour NaN NaN
1 NaN 2014-12 Metropolitan Police Service Metropolitan Police Service 0.140035 51.589112 On or near Beansland Grove E01000027 Barking and Dagenham 001A Anti-social behaviour NaN NaN
2 d1fb841e44cc811cecf3c6744e3b7d0f1e3bd96b24a441... 2014-12 Metropolitan Police Service Metropolitan Police Service 0.140035 51.589112 On or near Beansland Grove E01000027 Barking and Dagenham 001A Burglary Investigation complete; no suspect identified NaN
3 48f91db00f9cb836ebd4b418ccaf490d2647a47fcd16ed... 2014-12 Metropolitan Police Service Metropolitan Police Service 0.135554 51.584985 On or near Rose Lane E01000027 Barking and Dagenham 001A Burglary Defendant found not guilty NaN
4 1cefdbd4841090ecfe5b481897497a1426b1c1dd503cfc... 2014-12 Metropolitan Police Service Metropolitan Police Service 0.133181 51.586719 On or near Rosehatch Avenue E01000027 Barking and Dagenham 001A Criminal damage and arson Status update unavailable NaN

Clean Data

In [6]:
def clean_data(input_df):
    """Return a tidied copy of the raw crime frame; ``input_df`` is not modified.

    Steps:
      * column names are lower-cased with spaces replaced by underscores;
      * a ``date`` column is parsed from ``month`` (``YYYY-MM``);
      * ``longitude`` / ``latitude`` are rounded to 4 decimal places;
      * rows missing either coordinate are discarded;
      * columns not needed downstream are dropped and the index is reset.
    """
    unused_columns = ['month', 'falls_within', 'reported_by', 'last_outcome_category', 'context',
                      'lsoa_code', 'lsoa_name', 'crime_id']
    coordinate_columns = ['longitude', 'latitude']

    cleaned = input_df.copy()
    cleaned.columns = [pythonise_string(name) for name in cleaned.columns]
    cleaned['date'] = pd.to_datetime(cleaned['month'], format='%Y-%m')
    cleaned[coordinate_columns] = cleaned[coordinate_columns].round(4)

    # Keep only the rows where both coordinates are present.
    has_coordinates = cleaned['longitude'].notnull() & cleaned['latitude'].notnull()
    cleaned = cleaned.loc[has_coordinates]

    return cleaned.drop(columns=unused_columns).reset_index(drop=True)

clean_df = clean_data(raw_df)
# Show the distinct crime categories first, then a preview of the cleaned frame.
display(clean_df['crime_type'].unique(), clean_df.head())
array(['Anti-social behaviour', 'Burglary', 'Criminal damage and arson',
       'Drugs', 'Other theft', 'Violence and sexual offences',
       'Other crime', 'Public order', 'Vehicle crime', 'Robbery',
       'Shoplifting', 'Theft from the person', 'Bicycle theft',
       'Possession of weapons'], dtype=object)
longitude latitude location crime_type date
0 0.1349 51.5881 On or near Mead Grove Anti-social behaviour 2014-12-01
1 0.1400 51.5891 On or near Beansland Grove Anti-social behaviour 2014-12-01
2 0.1400 51.5891 On or near Beansland Grove Burglary 2014-12-01
3 0.1356 51.5850 On or near Rose Lane Burglary 2014-12-01
4 0.1332 51.5867 On or near Rosehatch Avenue Criminal damage and arson 2014-12-01

Geopandas

  • GeoPandas lets us work with geometric objects such as points and polygons, and perform SPATIAL JOINS between them!
In [7]:
# Turn every (longitude, latitude) pair into a shapely Point, append it as a
# 'coordinate' column on a copy of clean_df, and use that column as the
# geometry of a GeoDataFrame tagged with ``crs``.
gdf = gpd.GeoDataFrame(
    clean_df.assign(
        coordinate=pd.Series(
            list(zip(clean_df['longitude'], clean_df['latitude'])),
            index=clean_df.index,
        ).apply(Point)
    ),
    crs=crs,
    geometry='coordinate',
)
display(gdf.head(), gdf.crs)
longitude latitude location crime_type date coordinate
0 0.1349 51.5881 On or near Mead Grove Anti-social behaviour 2014-12-01 POINT (0.1349 51.5881)
1 0.1400 51.5891 On or near Beansland Grove Anti-social behaviour 2014-12-01 POINT (0.14 51.5891)
2 0.1400 51.5891 On or near Beansland Grove Burglary 2014-12-01 POINT (0.14 51.5891)
3 0.1356 51.5850 On or near Rose Lane Burglary 2014-12-01 POINT (0.1356 51.585)
4 0.1332 51.5867 On or near Rosehatch Avenue Criminal damage and arson 2014-12-01 POINT (0.1332 51.5867)
{'init': 'epsg:4326'}
In [8]:
# Load the boundary shapefile, normalise its column names the same way as the
# crime data, and discard the attribute columns that are not used below.
map_df = gpd.read_file(map_data_path)
map_df.columns = list(map(pythonise_string, map_df.columns))
map_df = map_df.drop(columns=['gss_code', 'sub_2009', 'sub_2006', 'nonld_area', 'ons_inner'])
display(map_df.head(), map_df.crs)
name hectares geometry
0 Kingston upon Thames 3726.117 POLYGON ((516401.6 160201.8, 516407.3 160210.5...
1 Croydon 8649.441 POLYGON ((535009.2 159504.7, 535005.5 159502, ...
2 Bromley 15013.487 POLYGON ((540373.6 157530.4, 540361.2 157551.9...
3 Hounslow 5658.541 POLYGON ((521975.8 178100, 521967.7 178096.8, ...
4 Ealing 5554.428 POLYGON ((510253.5 182881.6, 510249.9 182886, ...
{'proj': 'tmerc',
 'lat_0': 49,
 'lon_0': -2,
 'k': 0.999601272,
 'x_0': 400000,
 'y_0': -100000,
 'datum': 'OSGB36',
 'units': 'm',
 'no_defs': True}
In [9]:
# Reproject the boundary polygons straight into the CRS assigned to the crime
# points (``crs``). Both frames then carry identical CRS metadata for the
# spatial join, and the ``crs`` attribute never has to be overwritten by hand.
map_df = map_df.to_crs(crs)
display(map_df.head())
display(map_df.crs)
# Left spatial join from polygons to points: one row per (polygon, contained point),
# with ``index_right`` holding the matching row label in ``gdf``.
# NOTE(review): newer geopandas releases spell ``op=`` as ``predicate=`` - confirm
# against the installed version before changing it.
connected_df = gpd.sjoin(map_df, gdf, how="left", op='contains').reset_index(drop=True)
name hectares geometry
0 Kingston upon Thames 3726.117 POLYGON ((-0.3306790629424527 51.3290110106029...
1 Croydon 8649.441 POLYGON ((-0.06402123962011302 51.318637659874...
2 Bromley 15013.487 POLYGON ((0.0121309385091763 51.29959905965642...
3 Hounslow 5658.541 POLYGON ((-0.2445623945250543 51.4887021763399...
4 Ealing 5554.428 POLYGON ((-0.4118326897314672 51.5340838625760...
{'init': 'epsg:4326'}

Visualise using Matplotlib

In [10]:
# Boundary polygons as the base layer, matched crime points overlaid on the same axes.
fig, ax = plt.subplots(1, figsize=(10, 6))
map_df.plot(ax=ax)
# The ``how="left"`` spatial join leaves ``index_right`` as NaN for any polygon
# that contains no crime point (which also turns the column into floats).
# Drop those entries and restore integer labels before using them with ``.loc``,
# so a missing label is never looked up.
gdf.loc[connected_df['index_right'].dropna().astype('int64')] \
    .plot(ax=ax, marker='o', color='red', alpha=0.1, markersize=0.1)
Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x26da1c5c9b0>

Visualise using Plotly and Mapbox

In [11]:
count_column_name = 'crime_type'
groupby_columns = ['longitude', 'latitude', 'crime_type']
remaining_column = ['location']

# Restrict to crimes matched to a polygon by the spatial join. The ``how="left"``
# join leaves ``index_right`` as NaN for polygons without any crime point, so
# drop those and restore integer labels before indexing with ``.loc``.
# Then count records per (longitude, latitude, crime_type) and pivot the crime
# types into columns, filling absent combinations with 0.
crime_df = (
    clean_df.loc[connected_df['index_right'].dropna().astype('int64')]
    .groupby(groupby_columns)[remaining_column]
    .count()
    .unstack('crime_type')
    .fillna(0)
    .astype('int64')
)
# Unstacking leaves a two-level column index ('location', <crime type>); keep only the crime type.
crime_df.columns = crime_df.columns.droplevel(0)
crime_df['Total'] = crime_df.sum(axis=1)

display(crime_df.head())
crime_type Anti-social behaviour Bicycle theft Burglary Criminal damage and arson Drugs Other crime Other theft Possession of weapons Public order Robbery Shoplifting Theft from the person Vehicle crime Violence and sexual offences Total
longitude latitude
-0.5076 51.4687 2 0 0 0 0 0 0 0 0 0 0 0 0 0 2
-0.5051 51.4745 3 0 0 0 0 0 0 0 0 0 0 0 0 0 3
-0.4977 51.6116 5 0 5 0 0 0 0 0 0 0 0 0 0 1 11
-0.4974 51.6066 1 0 6 0 0 0 0 0 0 0 0 0 4 2 13
-0.4969 51.6138 0 0 2 2 0 0 2 0 0 0 0 0 2 0 8
In [21]:
# One row per location: total count, hover text and percentile rank of the total.
plot_df = crime_df[['Total']].rename(columns={'Total': 'count'})
# Hover text lists every non-zero column of the row (crime types and 'Total') as
# "<name>: <count><br>". The string is built directly rather than by formatting a
# dict and stripping its punctuation with a regex: that approach depended on
# ``Series.iteritems`` and on ``str.replace`` treating its pattern as a regular
# expression by default, neither of which holds in pandas 2.x.
plot_df['text'] = crime_df.apply(
    lambda row: ''.join(f'{key}: {value}<br>' for key, value in row.items() if value != 0),
    axis=1,
    )
plot_df = plot_df.reset_index()
plot_df['percentile'] = plot_df['count'].rank(pct=True)

# Single scatter trace: one marker per row of plot_df, i.e. per distinct
# (longitude, latitude) pair.
data = [
    go.Scattermapbox(
        lat=plot_df['latitude'],
        lon=plot_df['longitude'],
        mode='markers',
        marker=go.scattermapbox.Marker(
            size=7,
            opacity=.5,
            # Colour value is the inverted percentile rank of the total count, so the
            # locations with the highest totals get values closest to 0.
            # NOTE(review): presumably inverted to suit the direction of plotly's
            # 'YlOrRd' scale - confirm the resulting colours are the intended way round.
            color=1-plot_df['percentile'],
            colorscale='YlOrRd',
            ),
        # Hover text: the per-column breakdown string stored in plot_df['text'].
        text=plot_df['text'],
        )
    ]

# Map layout: Mapbox 'streets' base map, no rotation or tilt, authenticated with
# the token obtained earlier in the notebook.
layout = go.Layout(
    autosize=True,
    hovermode='closest',
    mapbox=go.layout.Mapbox(
        accesstoken=mapbox_access_token,
        bearing=0,
        style='streets',
        # Initial view centre. NOTE(review): presumably central London - confirm.
        center=go.layout.mapbox.Center(
            lat=51.509865,
            lon=-0.118092,
            ),
        pitch=0,
        zoom=8,
        ),
    )

# Combine trace and layout; the figure is rendered separately.
fig = go.Figure(data=data, layout=layout)
In [22]:
# Render the figure inline using plotly's offline ``iplot``.
iplot(fig)